QUESTION #1: Load, adapt the data and create metadata - Load the four different replicates provided as separate files into separate variables. - Check data is correctly loaded, and all samples have the same dimensions. - Remember to adapt the data accordingly. We need columns of the tables to represent variables (genes) and rows to be samples. Also, we need to merge all tables of observations. Check all samples are in the same order. - Create a meta data dataframe containing information regarding replicate, strains and time points. Information is either included in the sample name or in the introduction to the dataset stated above.
# --------------------------
## [#1] Read the data
# --------------------------
R1_data <- read.csv("data/rep1.csv", row.names = 1)
R2_data <- read.csv("data/rep2.csv", row.names = 1)
R3_data <- read.csv("data/rep3.csv", row.names = 1)
R4_data <- read.csv("data/rep4.csv", row.names = 1)
# Observe the data:
head(R1_data[,1:5])
## A.t1_rep1 A.t2_rep1 A.t3_rep1 A.t4_rep1 A.t5_rep1
## YDL248W 5026 5634 4964 5108 5552
## YDL247W-A 52 28 42 24 48
## YDL247W 1264 1286 1158 1116 1252
## YDL246C 822 854 814 826 922
## YDL245C 886 816 888 824 836
## YDL244W 900 1040 974 982 946
head(R2_data[,1:5])
## A.t1_rep2 A.t2_rep2 A.t3_rep2 A.t4_rep2 A.t5_rep2
## YDL248W 12535 14050 12430 12760 13885
## YDL247W-A 140 75 130 105 160
## YDL247W 3185 3200 2935 2765 3130
## YDL246C 2045 2105 1970 2110 2255
## YDL245C 2240 2010 2185 2080 2080
## YDL244W 2240 2610 2440 2480 2410
head(R3_data[,1:5])
## A.t1_rep3 A.t2_rep3 A.t3_rep3 A.t4_rep3 A.t5_rep3
## YDL248W 25090 28150 24870 25560 27740
## YDL247W-A 270 150 280 230 290
## YDL247W 6370 6420 5880 5540 6280
## YDL246C 4120 4270 4060 4240 4660
## YDL245C 4490 4040 4420 4170 4200
## YDL244W 4590 5210 4880 4980 4840
head(R4_data[,1:5])
## A.t1_rep4 A.t2_rep4 A.t3_rep4 A.t4_rep4 A.t5_rep4
## YDL248W 50240 56340 49820 51160 55540
## YDL247W-A 580 360 660 540 620
## YDL247W 12800 12920 11840 11160 12620
## YDL246C 8320 8620 8180 8560 9380
## YDL245C 9040 8140 8880 8420 8440
## YDL244W 9220 10460 9820 10060 9760
Check for duplicates in our data:
length(colnames(R1_data)) == length(colnames(R2_data)) && length(colnames(R2_data)) == length(colnames(R3_data)) && length(colnames(R3_data)) == length(colnames(R4_data))
## [1] TRUE
# --------------------------
## [#2] Adapt the data
# --------------------------
X1 <- t(R1_data)
X2 <- t(R2_data)
X3 <- t(R3_data)
X4 <- t(R4_data)
## Merge
X <- rbind(X1, X2, X3, X4)
## Check if all samples are in the same order
all(rownames(R1_data) == rownames(R2_data) & rownames(R2_data) == rownames(R3_data) & rownames(R3_data) == rownames(R4_data))
## [1] TRUE
## Check the data:
dim(X1)
## [1] 12 7126
dim(X2)
## [1] 12 7126
dim(X3)
## [1] 12 7126
dim(X4)
## [1] 12 7126
dim(X)
## [1] 48 7126
# --------------------------
## [#3] Load the metadata
# --------------------------
metadata <- data.frame(
row.names = row.names(X),
Strain = gsub("(.*)\\.t[0-9]+_rep[0-9]+", "\\1", rownames(X)),
Time_Point = as.factor(gsub(".*\\.t([0-9]+)_rep[0-9]+", "\\1", rownames(X))),
Replicate = as.factor(gsub(".*_rep([0-9]+)", "\\1", rownames(X))),
Total_Expression = rowSums(X)
)
metadata$Strain <- as.factor(metadata$Strain)
head(metadata)
## Strain Time_Point Replicate Total_Expression
## A.t1_rep1 A 1 1 40451322
## A.t2_rep1 A 2 1 44874734
## A.t3_rep1 A 3 1 39690864
## A.t4_rep1 A 4 1 41991024
## A.t5_rep1 A 5 1 41959830
## A.t6_rep1 A 6 1 40839142
QUESTION #2: PCA representation - Represent the data with a PCA projection: non-scaled, scaled, normalized. - Argue whether batch effects are present between the 4 replicates. - Remove outliers if necessary. - Show the different steps in the process and comparison with raw-data and normalized. - Scale or normalize data appropriately. - Identify and show any correlation with expression if necessary.
# Upload libraries
library(ggfortify)
library(factoextra)
library(dplyr)
library(umap)
## PCA NOT SCALED
pca_res <- prcomp(X, scale=FALSE)
# Basic plot
plot(pca_res$x)
fviz_pca_ind(pca_res, geom.ind = "point", col.ind = metadata$Strain, addEllipses = TRUE)
fviz_pca_ind(pca_res, geom.ind = "point", col.ind = metadata$Replicate, addEllipses = TRUE)
fviz_pca_ind(pca_res, geom.ind = "point", col.ind = metadata$Time_Point, addEllipses = TRUE)
autoplot(pca_res, data=metadata, colour="Strain", title = "PCA - Color by Strain") + theme_classic()
autoplot(pca_res, data=metadata, colour="Replicate", title = "PCA - Color by Replicate") + theme_classic()
autoplot(pca_res, data=metadata, colour="Time_Point", title = "PCA - Color by Time Point") + theme_classic()
plot(pca_res$x, pch=19, col=metadata$Strain, main = "PCA - Color by Strain", xlab = "PC1", ylab = "PC2")
legend("topright", legend = levels(metadata$Strain), col = unique(metadata$Strain), pch = 19)
plot(pca_res$x, pch=19, col=metadata$Replicate, main = "PCA - Color by Replicate", xlab = "PC1", ylab = "PC2")
legend("topright", legend = levels(metadata$Replicate), col = unique(metadata$Replicate), pch = 19)
plot(pca_res$x, pch=19, col=metadata$Time_Point, main = "PCA - Color by Time Point", xlab = "PC1", ylab = "PC2")
legend("topright", legend = levels(metadata$Time_Point), col = unique(metadata$Time_Point), pch = 19)
Firstly, we have done different plots using a PCA projection without scaled data. We have colored each plot with a different variable to observe how the PCA differs between Strains, Replicates and Time Points. We observe a trend where samples are distributed from the middle zone of PC2 and bifurcates in diagonal between replicates, being A PC2 positive and B PC2 negative. We are able to observe some clusters between the same Strain, Replicate and Time Point, even though we have an outlier of Strain B, Replicate 3 and Time Point 3. This outlier is situated near the 0.0 value of PC2.
fviz_eig(pca_res, addlabels = TRUE, main = "Principal components: Variance explained")
pca_scaled <- prcomp(X, scale = TRUE)
fviz_eig(pca_scaled, addlabels = TRUE, main = "Principal components: Variance explained: PCA scaled")
We observe a slightly difference between the principal component variance of not scaled and scaled. Basically, when we have not scale all the dimensions from 4 to 10 are equal to 0%, while in the case of scale the value is increased a little bit. This percentage increase is taken out from dimension 1 and 2 and spread through the other dimensions in decreasing order.
PCA SCALED
plot(pca_scaled$x)
fviz_pca_ind(pca_scaled, geom.ind = "point", col.ind = metadata$Strain, addEllipses = TRUE)
fviz_pca_ind(pca_scaled, geom.ind = "point", col.ind = metadata$Replicate, addEllipses = TRUE)
fviz_pca_ind(pca_scaled, geom.ind = "point", col.ind = metadata$Time_Point, addEllipses = TRUE)
autoplot(pca_scaled, data=metadata, colour="Strain", title = "PCA - Color by Strain") + theme_classic()
autoplot(pca_scaled, data=metadata, colour="Replicate", title = "PCA - Color by Replicate") + theme_classic()
autoplot(pca_scaled, data=metadata, colour="Time_Point", title = "PCA - Color by Time Point") + theme_classic()
plot(pca_scaled$x, pch=19, col=metadata$Strain, main = "PCA - Color by Strain", xlab = "PC1", ylab = "PC2")
legend("topright", legend = levels(metadata$Strain), col = unique(metadata$Strain), pch = 19)
plot(pca_scaled$x, pch=19, col=metadata$Replicate, main = "PCA - Color by Replicate", xlab = "PC1", ylab = "PC2")
legend("topright", legend = levels(metadata$Replicate), col = unique(metadata$Replicate), pch = 19)
plot(pca_scaled$x, pch=19, col=metadata$Time_Point, main = "PCA - Color by Time Point", xlab = "PC1", ylab = "PC2")
legend("topright", legend = levels(metadata$Time_Point), col = unique(metadata$Time_Point), pch = 19)
We have scaled all the values of our data and we have done again all the corresponding plots. We are not able to observe a big difference between not scale and scale but there is some. We continue having the clusters as before and the outlier is still there.
Then, we want to take out the outlier of our data. As we know, we only have one sample that is an outlier, so in order to take it out, we will do it by eliminating the corresponding row. As we know which sample name corresponds to the outlier (B.t1_rep3) we only have to delete it. We have to take into account that in the metadata is also included so we have also to eliminate from there.
Take outliers:
dim(X)
## [1] 48 7126
X <- X[-31, , drop = FALSE]
metadata <- metadata[-31, , drop = FALSE]
dim(X)
## [1] 47 7126
Plot with outliers:
plot(pca_scaled$x, pch=19, col=metadata$Replicate, main = "PCA - Color by Replicate", xlab = "PC1", ylab = "PC2")
legend("topright", legend = levels(metadata$Replicate), col = unique(metadata$Replicate), pch = 19)
pca_res <- prcomp(X, scale=FALSE)
autoplot(pca_res, data=metadata, colour="Strain", title = "PCA - Color by Strain") + theme_classic()
autoplot(pca_res, data=metadata, colour="Replicate", title = "PCA - Color by Replicate") + theme_classic()
autoplot(pca_res, data=metadata, colour="Time_Point", title = "PCA - Color by Time Point") + theme_classic()
pca_scaled <- prcomp(X, scale = TRUE)
autoplot(pca_scaled, data=metadata, colour="Strain", title = "PCA - Color by Strain") + theme_classic()
autoplot(pca_scaled, data=metadata, colour="Replicate", title = "PCA - Color by Replicate") + theme_classic()
autoplot(pca_scaled, data=metadata, colour="Time_Point", title = "PCA - Color by Time Point") + theme_classic()
After we take the outlier, we have done the plots again and know we are able to observe only the important data. In that way, that outlier cannot distort the results and conclusions drawn from the data.
Understanding the batch effect
plot(metadata$Total_Expression, type ="h", col=metadata$Strain)
plot(metadata$Total_Expression, type ="h", col=metadata$Replicate)
ggplot(as.data.frame(pca_scaled$x), aes(x = PC1, y =PC2, color = metadata$Total_Expression)) +
geom_point() +
scale_color_gradient(name = "Total expression", low = "green", high = "red") +
theme_classic()
ggplot(data = metadata, aes(x=rownames(X), y=Total_Expression, fill=Strain)) +
geom_bar(stat='identity') +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(x = "Row names")
ggplot(data = metadata, aes(x=rownames(X), y=Total_Expression, fill=Replicate)) +
geom_bar(stat='identity') +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(x = "Row names")
ggplot(data = metadata, aes(x=rownames(X), y=Total_Expression, fill=Time_Point)) +
geom_bar(stat='identity') +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(x = "Row names")
ggplot(data = metadata, aes(x=rownames(X), y=Total_Expression, fill = Total_Expression)) +
geom_bar(stat='identity') +
theme_classic() +
scale_fill_gradient(low="green", high = "red") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(x="Row names")
We need to find a variable that highly correlates with expression. We check for the total gene expression per sample. Replicates are very different but we observe that Strain B has higher total expression. Moreover, specially Replicates 4 are the ones with the higher total expression. Once we know this, we observe that this samples are mostly on the right down of the plot. This means that a sample that is on the left of PC1 will have low expression. Symmetrically if a sample is on the right it will have high expression of all genes. We can also check for significant correlations.
Check if there are significant correlations:
plot(rowSums(X), pca_scaled$x[,1])
pca_points <- as_tibble(pca_scaled$x) %>% bind_cols(metadata) %>% as.data.frame()
pca_points$exprs <- rowSums(X)
pc1_mod <- lm(PC1 ~ exprs, pca_points)
summary(pc1_mod)
##
## Call:
## lm(formula = PC1 ~ exprs, data = pca_points)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.939 -9.300 2.846 7.636 31.594
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.045e+02 3.839e+00 -27.21 <2e-16 ***
## exprs 5.242e-07 1.539e-08 34.05 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.83 on 45 degrees of freedom
## Multiple R-squared: 0.9626, Adjusted R-squared: 0.9618
## F-statistic: 1159 on 1 and 45 DF, p-value: < 2.2e-16
We know there is correlation between PC1 and total expression because the p-value is very low.
autoplot(pca_scaled, data=metadata, colour= "Total_Expression", shape="Replicate") +
theme_classic() +
scale_color_gradient(low="green", high = "red")
We observe that now, there is no batch effect between replicates because in all clusters we have samples from different replicates.
Y = X/rowSums(X)
pca_norm <- prcomp(Y, scale=TRUE)
autoplot(pca_norm, data=metadata, colour="Strain") + theme_classic()
autoplot(pca_norm, data=metadata, colour="Replicate") + theme_classic()
autoplot(pca_norm, data=metadata, colour="Time_Point") + theme_classic()
plot(pca_norm$x, pch=19, col=metadata$Strain)
legend("topright", legend = levels(metadata$Strain), col = 1:length(levels(metadata$Strain)), pch = 19)
plot(pca_norm$x, pch=19, col=metadata$Replicate)
legend("bottomright", legend = levels(metadata$Replicate), col = 1:length(levels(metadata$Replicate)), pch = 19, cex = 0.8)
plot(pca_norm$x, pch=19, col=metadata$Time_Point)
legend("bottomright", legend = levels(metadata$Time_Point), col = 1:length(levels(metadata$Time_Point)), pch = 19, cex = 0.8)
Finally, we obtained the plots after normalizing the data and we are able to observe the differences between raw data and this new plots. While with raw data the the values where spread horizontally in the normalized data are vertically having the strain A in the PC1 negative and strain B in PC1 positive. Now, the replicates are not clustered with the same number of replicate instead they are mixed with the other replicates assuring us the batch effect is correct.
QUESTION #3: tSNE representation - Represent the data with a tSNE projection: raw data and normalized. - Again, argue whether batch effects are present between the 4 replicates. - Remove outliers if necessary. Scale or normalize data appropriately. - Show the different steps in the process and comparison with raw-data and normalized.
library(Rtsne)
# Raw data
tsne1 <- Rtsne(X, perplexity = 15)
plot(tsne1$Y, col = metadata$Strain, main = "tSNE 2D plot: Coloured by Strain")
legend("topright", pch=1, col=unique(metadata$Strain), legend = levels(metadata$Strain))
plot(tsne1$Y, col = metadata$Time_Point, main = "tSNE 2D plot: Coloured by Time_point")
legend("topright", pch=1, col=unique(metadata$Time_Point), legend = levels(metadata$Time_Point))
plot(tsne1$Y, col = metadata$Replicate, main = "tSNE 2D plot: Coloured by Replicate")
legend("topright", pch=1, col=unique(metadata$Replicate), legend = levels(metadata$Replicate))
tsne_plot <- data.frame(x = tsne1$Y[,1],
y = tsne1$Y[,2],
col = metadata$Strain)
ggplot(tsne_plot) +
geom_point(aes(x=x, y=y, color=metadata$Strain, shape = metadata$Replicate)) +
theme_classic() +
labs(title = "tSNE 2D plot: Strain vs Replicate", x = "tSNE1", y = "tSNE2", color = "Strain", shape = "Replicate") +
scale_shape_manual(values = c(3, 15, 16, 17))
ggplot(tsne_plot) +
geom_point(aes(x=x, y=y, color=metadata$Strain, shape = metadata$Time_Point)) +
theme_classic() +
labs(title = "tSNE 2D plot: Strain vs Time_point", x = "tSNE1", y = "tSNE2", color = "Strain", shape = "Time_point") +
scale_shape_manual(values = c(3, 15, 16, 17, 18, 8))
X <- X[-31, , drop = FALSE]
metadata <- metadata[-31, , drop = FALSE]
X=X[, colSums(X)>0]
Y = X/rowSums(X)
tsne_norm <- Rtsne(Y, pca = TRUE, perplexity = 15)
plot(tsne_norm$Y, col = metadata$Strain, main = "tSNE 2D plot: Coloured by Strain")
legend("topright", pch=1, col=unique(metadata$Strain), legend = levels(metadata$Strain))
plot(tsne_norm$Y, col = metadata$Time_Point, main = "tSNE 2D plot: Coloured by Time_point")
legend("topright", pch=1, col=unique(metadata$Time_Point), legend = levels(metadata$Time_Point))
plot(tsne_norm$Y, col = metadata$Replicate, main = "tSNE 2D plot: Coloured by Replicate")
legend("topright", pch=1, col=unique(metadata$Replicate), legend = levels(metadata$Replicate))
tsne_plot <- data.frame(x = tsne_norm$Y[,1],
y = tsne_norm$Y[,2],
col = metadata$Strain)
ggplot(tsne_plot) +
geom_point(aes(x=x, y=y, color=metadata$Strain, shape = metadata$Replicate)) +
theme_classic() +
labs(title = "tSNE 2D plot: I", x = "tSNE1", y = "tSNE2", color = "Strain", shape = "Replicate") +
scale_shape_manual(values = c(3, 15, 16, 17))
Here we have represent the data with a tSNE projection (raw data and normalized). In the raw data we can see four different clusters for the four different replicates. And, as we can see these four clusters of the replicates, we can say that there is a batch effect in the raw data. However, when we normalize the data, we can now see that the replicates are mixed and we just see two clusters of Strain A and Strain B. Moreover, when we use the normalized data we have also removed outliers to observe directly the results.
QUESTION #4: tSNE parameters Test the effect of reproducibility, perplexity and iterations. Use normalized dataset only. Provide comments and thoughts about it.
Y = X/rowSums(X)
tsne_norm <- Rtsne(Y, pca = TRUE, perplexity = 15)
plot(tsne_norm$Y, col = metadata$Strain, main = "tSNE 2D plot: Coloured by Strain")
legend("topright", pch=1, col=unique(metadata$Strain), legend = levels(metadata$Strain))
plot(tsne_norm$Y, col = metadata$Time_Point, main = "tSNE 2D plot: Coloured by Time_point")
legend("topright", pch=1, col=unique(metadata$Time_Point), legend = levels(metadata$Time_Point))
plot(tsne_norm$Y, col = metadata$Replicate, main = "tSNE 2D plot: Coloured by Replicate")
legend("topright", pch=1, col=unique(metadata$Replicate), legend = levels(metadata$Replicate))
tsne_plot <- data.frame(x = tsne_norm$Y[,1],
y = tsne_norm$Y[,2],
col = metadata$Strain)
ggplot(tsne_plot) +
geom_point(aes(x=x, y=y, color=metadata$Strain, shape = metadata$Replicate)) +
theme_classic() +
labs(title = "tSNE 2D plot: Color vs Strain", x = "tSNE1", y = "tSNE2", color = "Strain", shape = "Replicate") +
scale_shape_manual(values = c(3, 15, 16, 17))
After running tSNE a couple times, we compare the results and we can see that the distribution of each point is not the same. This occurs because the plots have different starting points, a different origin. Although the plots are not the same, they hold the same conclusions.
# Perplexity
Y = X/rowSums(X)
set.seed(123) ## allows us to reproduce results
list_plots_pp <- list()
for (perplexity in c(1, 5, 10, 15)) {
tsne_pp <- Rtsne(Y, pca = TRUE, perplexity = perplexity)
tsne_data2 <- as.data.frame(tsne_pp$Y)
tsne_data2$Strain <- metadata$Strain
tsne_data2$Time_point <- metadata$Time_Point
tsne_data2$Replicate <- metadata$Replicate
# ggplot
plot_pp <- ggplot(tsne_data2, aes(x=V1, y=V2)) +
geom_point(aes(color=Time_point, shape=Replicate)) +
scale_shape_manual(values = c(3, 15, 16, 17)) +
ggtitle(paste0("tSNE 2D plot: perplexity value = ", perplexity)) +
labs(x = "tSNE1", y = "tSNE2") +
theme_classic()
print(plot_pp)
list_plots_pp[[paste0("perplexity", perplexity)]] = plot_pp
}
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
ggpubr::ggarrange(plotlist = list_plots_pp, common.legend = TRUE)
The maximum value of perplexity for our data set is 15 this is because the perplexity parameter should not be bigger than 3 * perplexity < nrow(X) - 1.
pp_max = nrow(X) - 1
pp_max
## [1] 46
3*16 < pp_max
## [1] FALSE
Once the perplexity number is above the value 15, 3*perplexity will be bigger than our number of rows minus one and we will get an error.
The perplexity says how to balance attention between local and global aspects of your data and it is a guess about the number of close neighbors each point has. So, each time we increase the value of perplexity, we are able to distinguish better and better the clusters.
With this data, if we set the perplexity to the default value (30) we obtain an error (as shown above) and so trying with different values, we have obtained that 15 is the maximum value of perplexity that this data can have. However, if we observe the resulting plots, we observe that the most suiting value for perplexity is 5. With perplexity 5 we can see two clusters for the two strains and in these we can see six groups for the different time points and in group the four replicates. We cannot see any batch effect.
Therefore, for the following umap plots we will be setting the perplexity value to 5.
# Maximum iterations
Y = X/rowSums(X)
set.seed(123)
list_plots_max <- list()
for (max_iter in c(1, 10, 100, 500, 1000)) {
tsne_pp <- Rtsne(Y, pca = TRUE, max_iter = max_iter, perplexity = 15)
tsne_data2 <- as.data.frame(tsne_pp$Y)
tsne_data2$Strain <- metadata$Strain
tsne_data2$Time_point <- metadata$Time_Point
tsne_data2$Replicate <- metadata$Replicate
# ggplot
plot_max <- ggplot(tsne_data2, aes(x=V1, y=V2)) +
geom_point(aes(color=Strain, shape=Replicate)) +
scale_shape_manual(values = c(3, 15, 16, 17)) +
ggtitle(paste0("tSNE 2D plot: max_iter = ", max_iter)) +
labs(x = "tSNE1", y = "tSNE2") +
theme_classic()
print(plot_max)
list_plots_max[[paste0("max_iter", max_iter)]] = plot_max
}
ggpubr::ggarrange(plotlist = list_plots_max, common.legend = TRUE)
In the case of iterations, the more iterations the better. We can see that when we do one iteration, the clusters are not visible we just see a huge clump of data points in the center of our plot. However, as we increase the number of iterations we can start to see the formations of clusters. Also, we could keep increasing the number of iterations but trying not to get to a point where it is not feasible
QUESTION #5: UMAP interpretation Create a UMAP visualization of the results. Explore differences between producing results from raw data counts or normalized results. Remove outlier and interpret the effects of it.
raw_UMAP <- umap(X, perplexity = 5)
plot(raw_UMAP$layout, main = "UMAP Representation", col = metadata$Strain, pch = 16)
legend("topright", legend = levels(metadata$Strain), col = unique(metadata$Strain), pch = 16, title = "Strain")
plot(raw_UMAP$layout, main = "UMAP Representation", col = metadata$Time_Point, pch = 16)
legend("topright", legend = levels(metadata$Time_Point), col = unique(metadata$Time_Point), pch = 16, title = "Time Point")
plot(raw_UMAP$layout, main = "UMAP Representation", col = metadata$Replicate, pch = 16)
legend("topright", legend = levels(metadata$Replicate), col = unique(metadata$Replicate), pch = 16, title = "Replicate")
raw_UMAP_plot <- data.frame(x = raw_UMAP$layout[,1], y = raw_UMAP$layout[,2], color = metadata$Replicate)
ggplot(data = raw_UMAP_plot) +
geom_point(aes(x = x, y = y, color = color, shape = metadata$Strain)) +
labs(title = "UMAP Representation") +
scale_color_discrete(name = "Replicate") +
scale_shape_manual(values = c(3, 19), name = "Strain") +
theme_classic()
We have done different plots using UMAP with raw data. We have colored each plot with a different variable to observe how the samples using UMAP differs between Strains, Replicates and Time Points. We observe a trend where samples are distributed from the negative x-axis and increases up to positive values in the y-axis. We are able to observe some clusters between the same Strain, same Replicate and same Time Point, even though we have an outlier of Strain B, Replicate 3 and Time Point 3. This outlier is situated in the negative x-axis between -2 and -3. We can assure there is batch effect because the clusters are separated between Strains, in order to don’t have batch effect we should have in the same cluster both strains.
X= X[, colSums(X)>0]
Y = X/rowSums(X)
norm_dataSet_UMAP <- umap::umap(Y, perplexity = 5)
plot(norm_dataSet_UMAP$layout, main = "UMAP Representation", col = metadata$Strain, pch = 16)
legend("topleft", legend = levels(metadata$Strain), col = unique(metadata$Strain), pch = 16, title = "Strain")
plot(norm_dataSet_UMAP$layout, main = "UMAP Representation", col = metadata$Time_Point, pch = 16)
legend("topleft", legend = levels(metadata$Time_Point), col = unique(metadata$Time_Point), pch = 16, cex = 0.75, title = "Time Point")
plot(norm_dataSet_UMAP$layout, main = "UMAP Representation", col = metadata$Replicate, pch = 16)
legend("topleft", legend = levels(metadata$Replicate), col = unique(metadata$Replicate), pch = 16, title = "Replicate")
norm_UMAP_plot <- data.frame(x = norm_dataSet_UMAP$layout[,1], y = norm_dataSet_UMAP$layout[,2], color = metadata$Replicate)
ggplot(data = norm_UMAP_plot) +
geom_point(aes(x = x, y = y, color = color, shape = metadata$Strain)) +
labs(title = "UMAP Representation") +
scale_color_discrete(name = "Replicate") +
scale_shape_manual(values = c(3, 19), name = "Strain") +
theme_classic()
norm_UMAP_plot <- data.frame(x = norm_dataSet_UMAP$layout[,1], y = norm_dataSet_UMAP$layout[,2], color = metadata$Total_Expression)
ggplot(data = norm_UMAP_plot) +
geom_point(aes(x = x, y = y, color = color, shape = metadata$Strain)) +
labs(title = "UMAP Representation") +
scale_color_gradient(low="green", high = "red") +
theme_classic()
…
Take outliers:
dim(X)
## [1] 48 7126
X <- X[-31, , drop = FALSE]
metadata <- metadata[-31, , drop = FALSE]
dim(X)
## [1] 47 7126
X= X[, colSums(X)>0]
Y = X/rowSums(X)
fumap <- umap(Y, perplexity = 5)
plot(fumap$layout, main = "UMAP Representation", col = metadata$Strain, pch = 16)
legend("bottomright", legend = levels(metadata$Strain), col = unique(metadata$Strain), pch = 16, title = "Strain")
plot(fumap$layout, main = "UMAP Representation", col = metadata$Time_Point, pch = 16)
legend("topleft", legend = levels(metadata$Time_Point), col = unique(metadata$Time_Point), pch = 16, cex = 0.75, title = "Time Point")
plot(fumap$layout, main = "UMAP Representation", col = metadata$Replicate, pch = 16)
legend("topleft", legend = levels(metadata$Replicate), col = unique(metadata$Replicate), pch = 16, title = "Replicate")
final_UMAP_plot <- data.frame(x = fumap$layout[,1], y = fumap$layout[,2], color = metadata$Replicate)
ggplot(data = final_UMAP_plot) +
geom_point(aes(x = x, y = y, color = color, shape = metadata$Strain)) +
labs(title = "UMAP Representation") +
scale_color_discrete(name = "Replicate") +
scale_shape_manual(values = c(3, 19), name = "Strain") +
theme_classic()
final_UMAP_plot <- data.frame(x = fumap$layout[,1], y = fumap$layout[,2], color = metadata$Total_Expression)
ggplot(data = final_UMAP_plot) +
geom_point(aes(x = x, y = y, color = color, shape = metadata$Strain)) +
labs(title = "UMAP Representation") +
scale_color_gradient(low="green", high = "red") +
theme_classic()
…
QUESTION #6: Predict the data. Explore the effect of predicting a new sample result from previous UMAP analysis. Compare results and give some thoughts on it.
X1_X2_X3 <- X[0:35,]
X1_X2_X3 = X1_X2_X3/rowSums(X1_X2_X3)
dataSet_UMAP <- umap::umap(X1_X2_X3)
#names(p1_dataSet_UMAP)
#dim(p1_dataSet_UMAP$data)
UMAP_X1_X2_X3 <- data.frame(x = dataSet_UMAP$layout[,1],
y = dataSet_UMAP$layout[,2],
col = metadata$Time_Point[0:35],
shape = metadata$Replicate[0:35])
ggplot(UMAP_X1_X2_X3) +
geom_point(aes(x=x, y=y, color=col, shape=shape)) +
theme_classic() +
labs(title = "Normalized X1, X2, X3 data UMAP", color = "Time point", shape = "Replicate") +
scale_shape_manual(values = c(3, 17, 15, 19))
X1_X2_X3 <- X[0:35,]
X1_X2_X3 = X1_X2_X3/rowSums(X1_X2_X3)
dataSet_UMAP <- umap::umap(X1_X2_X3)
#names(p1_dataSet_UMAP)
#dim(p1_dataSet_UMAP$data)
UMAP_X1_X2_X3 <- data.frame(x = dataSet_UMAP$layout[,1],
y = dataSet_UMAP$layout[,2],
col = metadata$Total_Expression[0:35],
shape = metadata$Replicate[0:35])
ggplot(UMAP_X1_X2_X3) +
geom_point(aes(x=x, y=y, color=col, shape=shape)) +
theme_classic() +
labs(title = "Normalized X1, X2, X3 data UMAP", color = "Time point", shape = "Replicate") +
scale_color_gradient(low="green", high = "red") +
scale_shape_manual(values = c(3, 17, 15, 19))
We performed a UMAP analysis on the first three replicates and in this plot and we cannot see any batch effect. We have are all three replicates for each time point in the two clusters for both strains. The clusters show variability on the total expression.
X4 = X4/rowSums(X4)
dataset_UMAP_x4 = stats::predict(dataSet_UMAP, X4)
# ggplot
UMAP_plot_predicted <- data.frame(x = dataset_UMAP_x4[,1],
y = dataset_UMAP_x4[,2],
col = metadata$Time_Point[36:47],
shape = metadata$Replicate[36:47])
ggplot(UMAP_plot_predicted) +
geom_point(aes(x=x, y=y, color=col, shape=shape)) +
theme_classic() +
labs(title = "UMAP analysis: Predicted X4", color = "Time point", shape = "Replicate")
X4 = X4/rowSums(X4)
dataset_UMAP_x4 = stats::predict(dataSet_UMAP, X4)
# ggplot
UMAP_plot_predicted <- data.frame(x = dataset_UMAP_x4[,1],
y = dataset_UMAP_x4[,2],
col = metadata$Total_Expression[36:47],
shape = metadata$Replicate[36:47])
ggplot(UMAP_plot_predicted) +
geom_point(aes(x=x, y=y, color=col, shape=shape)) +
theme_classic() +
labs(title = "UMAP analysis: Predicted X4", color = "Time point", shape = "Replicate") +
scale_color_gradient(low="green", high = "red")
With the previous UMAP analysis on X1, X2 and X3, we predicted a new sample result on X4. In this prediction we did not loose the clustering of the samples according to the strain so, we could say that this is an efficient prediction and that there is no batch effect.
Moreover, this prediction helps us understand the relationship with the previously analyzed data. We can see that the plot of X1, X2, and X3 has two clusters tightly packed and the plot of X4 also has two clusters but the samples are further from each other. This could suggest that X4 has more variability in the samples than the previous datasets and it might also suggest that the features in X4 and the embeddings in the UMAP space might not align as closely as with X1, X2, and X3.
QUESTION #7: Explore parameters. Explore parameters for the UMAP algorithm: number of neighbors, minimum distance and a third parameter of your choice.
Y = X/rowSums(X)
custom.umap.config <- umap::umap.defaults
list_plots_neighbors <- list()
for (n_neighbors in c(2, 5, 15)) {
custom.umap.config$n_neighbors = n_neighbors
umap_n <- umap(Y, config = custom.umap.config)
umap_data <- data.frame(x = umap_n$layout[,1],
y = umap_n$layout[,2],
col = metadata$Time_Point,
shape = metadata$Replicate)
# ggplot
plot_n <- ggplot(umap_data) +
geom_point(aes(x=x, y=y, color=col, shape=shape)) +
scale_shape_manual(values = c(3, 17, 15, 19)) +
ggtitle(paste0("UMAP: n_neighbors = ", n_neighbors)) +
labs(color = "Time point", shape = "Replicate") +
theme_classic()
print(plot_n)
list_plots_neighbors[[paste0("n_neighbors", n_neighbors)]] = plot_n
}
library(gridExtra)
ggpubr::ggarrange(plotlist = list_plots_neighbors, common.legend = TRUE) #print only 4
custom.umap.config <- umap::umap.defaults
list_plots_min <- list()
for (min_dist in c(0.1, 0.5, 0.9)) {
custom.umap.config$min_dist = min_dist
umap_min <- umap(Y, config = custom.umap.config)
umap_data <- data.frame(x = umap_min$layout[,1],
y = umap_min$layout[,2],
col = metadata$Time_Point,
shape = metadata$Replicate)
# ggplot
plot_min <- ggplot(umap_data) +
geom_point(aes(x=x, y=y, color=col, shape=shape)) +
scale_shape_manual(values = c(3, 17, 15, 19)) +
ggtitle(paste0("UMAP: min_dist = ", min_dist)) +
labs(color = "Time point", shape = "Replicate") +
theme_classic()
print(plot_min)
list_plots_min[[paste0("min_dist", min_dist)]] = plot_min
}
ggpubr::ggarrange(plotlist = list_plots_min, common.legend = TRUE)
custom.umap.config <- umap::umap.defaults
list_plots_metric <- list()
for (metric in c("euclidean", "cosine", "manhattan")) {
custom.umap.config$metric = metric
umap_metric <- umap(Y, config = custom.umap.config)
umap_data <- data.frame(x = umap_metric$layout[,1],
y = umap_metric$layout[,2],
col = metadata$Time_Point,
shape = metadata$Replicate)
# ggplot
plot_metric <- ggplot(umap_data) +
geom_point(aes(x=x, y=y, color=col, shape=shape)) +
scale_shape_manual(values = c(3, 17, 15, 19)) +
ggtitle(paste0("UMAP: metric = ", metric)) +
labs(color = "Time point", shape = "Replicate") +
theme_classic()
print(plot_metric)
list_plots_metric[[paste0("metric", metric)]] = plot_metric
}
ggpubr::ggarrange(plotlist = list_plots_metric, common.legend = TRUE)
The ‘n_neighbors’ parameter in UMAP controls the number of neighbors used in the construction of the graph and this will affect on the global structure of the UMAP. If we look at the plots of the neighbors parameters, we can clearly see that the best value for this parameter is 15. With this higher value we can see more packed and defined clusters than with the other values. With this high value we will see a moderate local connectivity and a representation of clusters that captures better the global structure.
The ‘min_dist’ parameter in UMAP sets the minimum distance between points. We can see that smaller ‘min_dist’ packs tightly the values leading to more emphasis on local structures but risking overlap between clusters. However, larger values of ‘min_dist’ separates more the points and preserves global structures but losing local details. For our data we believe that we should have a balance, a ‘min_dist’ that works well in capturing the local and global structures, for that we believe that we should choose ‘min_dist’ to be 0.1.
Finally, the ‘metric’ parameter. This parameter controls how distance is computed in the ambient space of the input data. For this we have choosen “euclidean” and “manhattan”, Minkowski style metrics and “cosine”, an angular and correlation metric. The euclidean metric measures the straight-line distance between points in a Euclidean space, the manhattan metric computes the distance by summing the absolute differences between coordinates and the cosine metric measures the cosine of the angle between two vectors.
We can see that all these metrics in our data create two clusters and these contain the four replicates and the six time points so, there is not batch effect. All three metric are pretty similar but we will choose the euclidean metric for the nature of our data.
QUESTION #8: Final interpretation Create a final visualization using results from PCA, tSNE and UMAP and write some thoughts on the comparison of the three high-dimension techniques. Add some thoughts on the difference between techniques regarding the outlier. Use as many metadata available as possible and generate clear and easy to interpret plots. Provide comments and thoughts about it. Do you get to the same conclusions?
ggplot(as.data.frame(pca_norm$x), aes(x = PC1, y = PC2, color = metadata$Strain, shape = metadata$Replicate)) +
geom_point() +
theme_minimal() +
labs(color = "Strain", shape = "Replicate", title = "PCA Normalized Data")
ggplot(as.data.frame(pca_norm$x), aes(x = PC1, y = PC2, color = metadata$Strain, shape = metadata$Time_Point)) +
geom_point() +
theme_minimal() +
labs(color = "Strain", shape = "Time Point", title = "PCA Normalized Data")
pca_plot <- autoplot(pca_norm, data=metadata, color="Total_Expression", shape="Strain") + theme_classic() +
scale_color_gradient(low="green", high = "red")
pca_plot
After proper normalization the PC1 separates the two different strains of sherry wines. This value suggests that there are consistent genetic differences between the yeast strains. For the case of PC2, we are able to see a progression of time points from positive to negative, which aligns with the developmental stages of velum formation. Observing the replicates, we can only say that we achieve plots where we don’t have batch effects as we have samples from the different replicates in all clusters.
set.seed(123)
Y = X/rowSums(X)
tsne_norm <- Rtsne(Y, pca = TRUE, perplexity = 5)
tsne_plot <- data.frame(x = tsne_norm$Y[,1],
y = tsne_norm$Y[,2],
col = metadata$Strain)
ggplot(tsne_plot) +
geom_point(aes(x=x, y=y, color=metadata$Strain, shape = metadata$Replicate)) +
theme_classic() +
labs(title = "tSNE 2D plot: Strain vs Replicate" , x = "tSNE1", y = "tSNE2", color = "Strain", shape = "Replicate") +
scale_shape_manual(values = c(3, 15, 16, 17))
set.seed(123)
ggplot(tsne_plot) +
geom_point(aes(x=x, y=y, color=metadata$Strain, shape = metadata$Time_Point)) +
theme_classic() +
labs(title = "tSNE 2D plot: Strain vs Time Point", x = "tSNE1", y = "tSNE2", color = "Strain", shape = "Time Point") +
scale_shape_manual(values = c(3, 15, 16, 17, 18, 8))
set.seed(123)
tsne_plot <- ggplot(tsne_plot) +
geom_point(aes(x=x, y=y, color=metadata$Total_Expression, shape = metadata$Strain)) +
theme_classic() +
scale_color_gradient(low="green", high = "red") +
labs(title = "tSNE 2D plot: Strain vs Total expression", x = "tSNE1", y = "tSNE2", color = "Total expression", shape = "Strain") +
scale_shape_manual(values = c(3, 17))
tsne_plot
Additionally, with the representation in the tSNE plots, we can say that the strains of sherry wines are separated from each other showing again that there are genetic differences between the yeast strains. Moreover, we can also see that we got rid of the batch effect as we see the replicates mixed in the two clusters.
final_UMAP_plot <- data.frame(x = fumap$layout[,1], y = fumap$layout[,2], color = metadata$Replicate)
ggplot(data = final_UMAP_plot) +
geom_point(aes(x = x, y = y, color = color, shape = metadata$Strain)) +
labs(title = "UMAP Representation") +
scale_color_discrete(name = "Replicate") +
scale_shape_manual(values = c(3, 19), name = "Strain") +
theme_classic()
final_UMAP_plot <- data.frame(x = fumap$layout[,1], y = fumap$layout[,2], color = metadata$Time_Point)
ggplot(data = final_UMAP_plot) +
geom_point(aes(x = x, y = y, color = color, shape = metadata$Strain)) +
labs(title = "UMAP Representation") +
scale_color_discrete(name = "Time Point") +
scale_shape_manual(values = c(3, 19), name = "Strain") +
theme_classic()
final_UMAP_plot <- data.frame(x = fumap$layout[,1], y = fumap$layout[,2], color = metadata$Time_Point)
ggplot(data = final_UMAP_plot) +
geom_point(aes(x = x, y = y, color = color, shape = metadata$Replicate, alpha = metadata$Strain)) +
labs(title = "UMAP Representation") +
scale_color_discrete(name = "Time Point") +
scale_alpha_manual(values = c(0.5, 1.0), name = "Strain") +
scale_shape_manual(values = c(3, 4, 17, 19), name = "Replicate") +
theme_classic()
final_UMAP_plot <- data.frame(x = fumap$layout[,1], y = fumap$layout[,2], color = metadata$Total_Expression)
umap_plot <- ggplot(data = final_UMAP_plot, Treatment = metadata$Strain) +
geom_point(aes(x = x, y = y, color = color, shape = metadata$Replicate, alpha = metadata$Strain)) +
labs(title = "UMAP Representation", color = "Total Expression") +
scale_shape_manual(values = c(3, 4, 17, 19), name = "Replicate") +
scale_alpha_manual(values = c(0.5, 1.0), name = "Strain") +
scale_color_gradient(name = "Total expression", low = "green", high = "red") +
theme_classic()
umap_plot
library(gridExtra)
grid.arrange(pca_plot, tsne_plot, umap_plot, nrow = 3)
Finally, in both our PCA plot and our tSNE plot we can observe two different clusters for the strains. However, in the PCA plot we can see the samples distributed horizontally and more separated whereas in the tSNE plot, we see these two clusters but the samples are clustered tightly on opposite corners. Even tough we do this observation, we obtain the same conclusion that here are genetic differences between sherry wine yeast strains that might be reflected in the wine tasting.